#!/usr/bin/env bash
# deepdive-unload -- Unloads a given relation's data
# > deepdive unload RELATION[(COLUMN[,COLUMN]...)] [SINK...]
# Unloads given RELATION's data from the database to given SINKs for
# optionally specified COLUMNs.  When SINK is unspecified, the data is unloaded
# to input/RELATION.* under the DeepDive application.
#
# When RELATION is a random variable and no COLUMN is specified, the unloaded
# data will contain the `label` column at the end, in addition to all
# user-defined ones.
#
# This command must be run under an environment whose DEEPDIVE_DB_URL variable
# is set to a proper URL, or under a DeepDive application where the URL is set
# in the db.url file.
#
# The format of each SINK is determined by the filename extension, such as `.tsj`,
# `.tsv`, or `.csv`.  Setting the DEEPDIVE_LOAD_FORMAT environment variable to tsj,
# tsv, csv, or sql assumes that particular format, ignoring the filename.  Setting
# the DEEPDIVE_LOAD_FORMAT_DEFAULT environment variable makes the command fall back
# to the specified format when it cannot be determined from the filename.
#
# For example:
#
# > deepdive unload sentences
# unloads table 'sentences' to input/sentences.tsj.
#
# > deepdive unload sentences path/to/sentences.tsj
# unloads table 'sentences' to path/to/sentences.tsj.
#
# > deepdive unload sentences sentences.csv.bz2
# unloads table 'sentences' to a compressed CSV file.
##
set -eu

# Make sure both format variables are defined (possibly empty) so they can be
# expanded safely under `set -u`:
#   DEEPDIVE_LOAD_FORMAT          forces one format for every sink
#   DEEPDIVE_LOAD_FORMAT_DEFAULT  is used when the filename gives no hint
: "${DEEPDIVE_LOAD_FORMAT_DEFAULT:=}" "${DEEPDIVE_LOAD_FORMAT:=}"

# The first argument names the relation to unload; `usage` is a helper that is
# not defined in this file.
[[ $# -gt 0 ]] || usage "$0" "Missing RELATION to unload"
Relation=$1; shift
Columns=

# parse optional columns following relation name, i.e., RELATION(COL[,COL]...)
case $Relation in
    *"("*")")
        # keep column names separate: everything between the first "(" and the
        # trailing ")" becomes $Columns, everything before the first "(" stays
        # as $Relation
        Columns=${Relation#*"("}
        Columns=${Columns%")"}
        Relation=${Relation%%"("*}
        ;;
esac

# Decide where to unload to and load the database driver.
if [[ $# -gt 0 ]]; then
    # SINKs were given explicitly: rely on the app if found (optional)
    DEEPDIVE_APP=$(find-deepdive-app 2>/dev/null) || true
    export DEEPDIVE_APP
    . load-db-driver.sh
else
    # no SINK was given: a DeepDive app is required, and the data goes under
    # its input/ directory
    DEEPDIVE_APP=$(find-deepdive-app)
    export DEEPDIVE_APP
    . load-db-driver.sh

    cd "$DEEPDIVE_APP"
    # prefer an already existing input/RELATION.* file, trying every known
    # format with and without a compression suffix
    for Path in input/"$Relation".{tsj,tsv,csv}{,.bz2,.gz}; do
        if [[ -e "$Path" ]]; then
            break
        fi
    done
    # nothing exists yet: name the file after the requested or default format
    if [[ ! -e "$Path" ]]; then
        Path="input/$Relation.${DEEPDIVE_LOAD_FORMAT:-${DEEPDIVE_LOAD_FORMAT_DEFAULT:-tsj}}"
    fi
    # continue as if that path had been given as the only SINK
    set -- "$Path"
fi

# Determine the columns to unload.  Only done when no columns were given
# explicitly and a compiled app is available: if $Relation is a random
# variable (it has a variable_type in the compiled schema), unload its
# user-defined columns ordered by index followed by the internal `label`
# column; otherwise leave $Columns empty.
if [[ -z "$Columns" && -e "$DEEPDIVE_APP" ]] && app-has-been-compiled; then
    Columns=$(
        cd "$DEEPDIVE_APP"
        jq -r --arg relation "$Relation" '
            .deepdive_.schema.relations[$relation]
            | select(.variable_type)
            | .columns
            | to_entries
            | sort_by(.value.index)
            | map(.key) + ["label"]
            | join(",")
        ' run/compiled/config.json
    )
fi

# State of the batch being assembled:
#   currentFormat  format shared by every sink queued so far
#   pathsInFormat  the sink paths, kept for the progress message
#   sinksInFormat  the shell-escaped sink expressions, separated by spaces
currentFormat=
pathsInFormat=()
sinksInFormat=

# unloadQueued -- run a single db-unload call for everything queued so far.
# Falls back to DEEPDIVE_LOAD_FORMAT_DEFAULT when no format has been chosen and
# reports an error when that is empty as well.
# (eval is used here to support process substitution for on-the-fly
# compression: the queued sink expressions are parsed as shell words)
unloadQueued() {
    if [[ -z ${currentFormat:=$DEEPDIVE_LOAD_FORMAT_DEFAULT} ]]; then
        error "Specify DEEPDIVE_LOAD_FORMAT= or DEEPDIVE_LOAD_FORMAT_DEFAULT="
    fi
    echo "Unloading $Relation${Columns:+($Columns)} to ${pathsInFormat[*]:-/dev/stdout} ($currentFormat format)"
    # the fixed part stays single-quoted so its variables are expanded only
    # when eval runs it; the sink expressions are appended verbatim
    local unloadCommand='db-unload "SELECT ${Columns:-"*"} FROM $Relation" "$currentFormat" '
    eval "$unloadCommand$sinksInFormat"
}
# queue PATH SINK_EXPR -- add one more sink to the current batch.
# PATH is remembered for the progress message; SINK_EXPR is the already
# shell-escaped expression that unloadQueued later hands to eval.
queue() {
    local sinkPath=$1 sinkExpr=$2
    pathsInFormat+=("$sinkPath")
    sinksInFormat+="$sinkExpr "
}

# An explicitly specified format (possibly empty) is the starting point for
# both the current batch and the per-sink format.
currentFormat=$DEEPDIVE_LOAD_FORMAT
format=$currentFormat

# use USR1 to communicate failure from process substitutions: bzip2, gzip, or custom data sink executables (in the future)
trap false USR1

# Walk over the given sinks, grouping consecutive ones that share a format so
# each group is unloaded with a single batched call (to support parallel unload).
for path in "$@"; do
    # determine the format from the filename unless one was forced
    if [[ -z "$DEEPDIVE_LOAD_FORMAT" ]]; then
        if [[ $path == *.tsj || $path == *.tsj.* ]]; then
            format=tsj
        elif [[ $path == *.tsv || $path == *.tsv.* ]]; then
            format=tsv
        elif [[ $path == *.csv || $path == *.csv.* ]]; then
            format=csv
        else
            # no recognizable extension: the default format is mandatory
            if [[ -z "$DEEPDIVE_LOAD_FORMAT_DEFAULT" ]]; then
                error "$path: Unrecognized format, specify DEEPDIVE_LOAD_FORMAT= or DEEPDIVE_LOAD_FORMAT_DEFAULT="
            fi
            format=$DEEPDIVE_LOAD_FORMAT_DEFAULT
        fi
    fi

    # pick a compressor from the filename suffix, preferring pbzip2/pigz over
    # bzip2/gzip when they are available
    compress=
    if [[ $path == *.bz2 ]]; then
        if type pbzip2 &>/dev/null; then
            compress=pbzip2
        else
            compress=bzip2
        fi
    elif [[ $path == *.gz ]]; then
        if type pigz &>/dev/null; then
            compress=pigz
        else
            compress=gzip
        fi
    fi

    # a change of format closes the current batch: unload it (unless nothing
    # has been queued yet) and start a fresh one
    if [[ "$format" != "$currentFormat" ]]; then
        if [[ -n "$currentFormat" ]]; then
            unloadQueued
        fi
        currentFormat=$format
        pathsInFormat=()
        sinksInFormat=
    fi

    # queue a sink expression for this path
    case $compress in
        "")
            # plain file
            queue "$path" "$(escape4sh "$path")"
            ;;
        *)
            # wrapped with a compression process writing to the file; a
            # failure there sends USR1 to this script's process
            queue "$path" ">($(escape4sh "$compress") >$(escape4sh "$path") || kill -USR1 $$)"
            ;;
    esac
done

# unload everything queued so far
unloadQueued
